import requests
from bs4 import BeautifulSoup
import pymysql
import os

# Base URL of the scraped site; relative hrefs from the pages are joined onto it.
web = 'http://www.thwys631.xyz'

def get(url, time=1):
    """Fetch *url* and parse it with BeautifulSoup, retrying on failure.

    Parameters:
        url:  page URL to fetch.
        time: starting attempt number (kept for backward compatibility
              with existing callers; normally omitted).

    Returns:
        A BeautifulSoup document on success, or None once attempt 10 fails.
    """
    attempt = time
    while True:
        try:
            resp = requests.get(url, timeout=10)
            return BeautifulSoup(resp.content, 'lxml')
        # Was a bare `except:`; Exception still retries on any fetch/parse
        # error but no longer swallows KeyboardInterrupt/SystemExit.
        except Exception:
            if attempt == 10:
                print("ERROR!!!")
                print(url)
                return None
            # Report the failed attempt, then retry (iteratively, not
            # recursively, so deep retry chains don't grow the stack).
            print("第", attempt, "次尝试：", url)
            attempt += 1

def story_save(cate, date, title, url):
    """Fetch one story page and insert it into the `story2` table.

    Parameters:
        cate:  category label the story belongs to.
        date:  publication date — currently NOT stored in the table; kept
               only for interface compatibility with callers.
        title: story title (used as the row's title column).
        url:   absolute URL of the story page.
    """
    bsobj = get(url)
    if bsobj is None:
        return
    # Prefer the first <p> inside .main-content; otherwise fall back to the
    # whole container with links defused (href -> href_xxx).
    p_list = bsobj.select('.main-content > p')
    if len(p_list) > 0:
        # The quote replacement predates parameterized queries; kept so new
        # rows stay consistent with data already stored this way.
        content = str(p_list[0]).replace('"', "'")
    else:
        container = bsobj.select('.main-content')
        content = str(container[0]).replace('"', "'").replace('href', "href_xxx")

    # 打开数据库连接 (open the database connection)
    db = pymysql.connect(host="localhost", port=3306, user="root",
                         password="123456", database="y_site")
    try:
        cursor = db.cursor()
        # Parameterized INSERT: title/content come from scraped HTML and are
        # untrusted input — never splice them into the SQL string directly.
        sql = ('INSERT INTO `story2`(`cate`, `title`, `url`, `content`) '
               'VALUES (%s, %s, %s, %s)')
        try:
            cursor.execute(sql, (cate, title, url, content))
            db.commit()
        except Exception:
            # Best-effort insert: roll back and move on, matching the
            # original behavior of not aborting the crawl on one bad row.
            db.rollback()
    finally:
        # Close the connection even if execute/rollback raises.
        db.close()

def story(cate, url, last_page=False):
    """Crawl one listing page of category *cate*, saving unseen stories,
    then recurse onto the next page until the last page is processed.

    Parameters:
        cate:      category label, passed through to story_save.
        url:       listing-page URL to crawl.
        last_page: when True, process this page's items but do not paginate
                   further (set internally when "next" == "last").
    """
    bsobj = get(url)
    if bsobj is None:
        return
    a_list = bsobj.select('#colList > ul')
    for a in a_list[0].find_all('li'):
        date = a.find('a').find('span').text
        text = a.find('a').find('h2').text
        href = web + a.find('a').get('href')
        # Entries pointing at index.html are navigation, not stories — skip.
        if href.endswith('index.html'):
            continue

        # 打开数据库连接 (open the database connection)
        db = pymysql.connect(host="localhost", port=3306, user="root",
                             password="123456", database="y_site")
        try:
            cursor = db.cursor()
            # Parameterized lookup: the title is scraped (untrusted) text and
            # must not be spliced into the SQL string.
            cursor.execute('select * from `story2` where `title` = %s', (text,))
            results = cursor.fetchall()
        finally:
            # Previously the connection leaked if execute raised; always close.
            db.close()
        if len(results) > 0:
            # Already in the table — skip to avoid duplicate rows.
            print("pass:" + text)
        else:
            print('保存数据：', cate, date, text, href)
            story_save(cate, date, text, href)

    if last_page:
        return None
    page_list = bsobj.select('.pagination')
    next_page = ''
    end_page = ''
    page_href = ''
    for a in page_list[0].find_all('a'):
        if "下一页" == a.text:  # "next page" link
            next_page = web + a.get('href')
            page_href = next_page
        if "尾页" == a.text:  # "last page" link
            end_page = web + a.get('href')

    print()
    print("页码：", cate, next_page, end_page)

    # When "next" equals "last", the upcoming page is the final one: crawl it
    # with last_page=True so recursion terminates.
    if next_page == end_page:
        story(cate, page_href, last_page=True)
        return None
    story(cate, page_href)

if __name__ == '__main__':
    # Crawl entry point: read the site menu, find the "小说" (novels) section,
    # and crawl every category link inside it.
    url = web
    bsobj = get(url)
    if bsobj is not None:
        menu = bsobj.select('#menu > div:nth-child(1)')
        for dl in menu[0].find_all('dl'):
            # Only the <dl> whose <dt> label is "小说" contains the categories
            # we want; skip every other menu section.
            if dl.find_all('dt')[0].text != '小说':
                continue
            for dd in dl.find_all('dd'):
                for link in dd.find_all('a'):
                    cate_name = link.text
                    # Menu hrefs are relative; join onto the site root.
                    cate_url = url + link.get('href')
                    print(cate_name, cate_url)
                    story(cate_name, cate_url)





